# Chunk defaults: keep package messages and warnings out of the rendered report.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# Optional SDAM helper package (its library() call below is commented out):
# devtools::install_github("sdam-au/sdam")
# Fallback if the repository above does not work:
# devtools::install_github("mplex/cedhar", subdir = "pkg/sdam")
library(tidyverse)
#library(sdam)
library(jsonlite)
library(leaflet)
# Parse the cleaned EDH export from JSON and convert the result to a tibble.
list_json <- jsonlite::fromJSON("EDH_text_cleaned_2021-01-21.json")
EDH_tibble <- as_tibble(list_json)
# Output folder for figures; stay silent when it already exists on re-runs.
dir.create("../figures", showWarnings = FALSE)
Display the first 6 records
# Preview the first six rows of the dataset.
head(EDH_tibble, n = 6L)
Once I have pointers on how to extract the data from ‘people’, I will look at the following specific cases:
Specific case (funerary inscriptions; attribute ‘type_of_inscription_clean’ == ‘epitaph’) 1. How many people are on funerary inscriptions (total, average, min, max) 2. What is the ratio of genders on funerary inscriptions (male, female, NA) 3. What is the age of people on funerary inscriptions (total number of inscriptions with age, average, min, max) 4. What is the average age of people on funerary inscriptions by province
Specific case (gender composition) 1. Ratio of men/women on different types of inscriptions (attribute ‘type_of_inscription_clean’)
# Show the nested `people` entries of the first two records.
EDH_tibble[["people"]][1:2]
## [[1]]
## name cognomen nomen person_id gender praenomen
## 1 Noniae P.f. Optatae Optata Nonia 1 female <NA>
## 2 C. Iulio Artemoni Artemo Iulius 2 male C.
## 3 C. Iulius C.f. Optatus Optatus Iulius 3 male C.
##
## [[2]]
## nomen praenomen person_id age: years cognomen gender name
## 1 Sextius C. 1 70 Paris male C. Sextius Paris
# Expand the nested `people` list-column so that every person gets a row.
EDH_unnested <- unnest(EDH_tibble, people)
# List the columns that unnesting added to the original table.
setdiff(colnames(EDH_unnested), colnames(EDH_tibble))
## [1] "name" "cognomen" "nomen" "person_id" "gender"
## [6] "praenomen" "age: years" "tribus" "status" "occupation"
## [11] "origo" "age: days" "age: months" "supernomen" "age: hours"
One way through gender
# Tabulate people by gender (most frequent first); the counts add up to the
# total number of people.
gender <- count(EDH_unnested, gender, sort = TRUE)
sum(gender[["n"]])
## [1] 92427
Second way through nrow
# Total number of people = number of rows in the unnested table.
dim(EDH_unnested)[1]
## [1] 92427
# Distribution of the per-inscription person index.
EDH_unnested$person_id %>%
  as.numeric() %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 4.233 3.000 244.000
# Gender categories with their frequencies, most common first.
count(EDH_unnested, gender, sort = TRUE)
# Every distinct raw value of the `status` attribute.
unique(EDH_unnested$status)
## [1] NA
## [2] "senatorial order"
## [3] "slaves"
## [4] "freedmen / freedwomen"
## [5] "freedmen / freedwomen?"
## [6] "slaves?"
## [7] "senatorial order?"
## [8] "decurial order, higher local offices"
## [9] "military personnel"
## [10] "equestrian order"
## [11] "decurial order, higher local offices?"
## [12] "equestrian order?"
## [13] "military personnel?"
## [14] "Augustales"
## [15] "emperor / imperial household?"
## [16] "Augustales; freedmen / freedwomen"
## [17] "equestrian order?; decurial order, higher local offices"
## [18] "decurial order, higher local offices; freedmen / freedwomen"
## [19] "equestrian order; decurial order, higher local offices"
## [20] "lower local offices, administration of imperial estates"
## [21] "equestrian order; freedmen / freedwomen"
## [22] "decurial order, higher local offices; military personnel"
## [23] "Augustales?"
## [24] "equestrian order; military personnel"
## [25] "rulers (foreign)"
## [26] "senatorial order; equestrian order"
## [27] "emperor / imperial household; equestrian order"
## [28] "decurial order, higher local offices; Augustales"
## [29] "lower local offices, administration of imperial estates; freedmen / freedwomen"
## [30] "senatorial order; decurial order, higher local offices"
## [31] "equestrian order?; decurial order, higher local offices?"
## [32] "Augustales; decurial order, higher local offices"
## [33] "decurial order, higher local offices; military personnel?"
## [34] "lower local offices, administration of imperial estates?"
## [35] "decurial order, higher local offices; equestrian order"
## [36] "decurial order, higher local offices; Augustales?"
## [37] "emperor / imperial household; decurial order, higher local offices"
## [38] "decurial order, higher local offices?; lower local offices, administration of imperial estates"
## [39] "freedmen / freedwomen; military personnel"
## [40] "equestrian order; decurial order, higher local offices; military personnel"
## [41] "decurial order, higher local offices; lower local offices, administration of imperial estates"
## [42] "lower local offices, administration of imperial estates; military personnel"
## [43] "decurial order, higher local offices?; military personnel"
## [44] "equestrian order?; military personnel?"
## [45] "lower local offices, administration of imperial estates; Augustales"
## [46] "equestrian order; decurial order, higher local offices?"
## [47] "senatorial order?; equestrian order?"
## [48] "decurial order, higher local offices?; freedmen / freedwomen"
## [49] "Augustales?; freedmen / freedwomen"
## [50] "equestrian order?; lower local offices, administration of imperial estates"
# Kept for compatibility: the first three ";"-separated status pieces per person.
status <- as.data.frame(str_split_fixed(EDH_unnested$status, ";", n = 3))

# Count individual status categories. Each raw value may hold several
# categories separated by ";", some flagged as uncertain with "?".
# Splitting without a fixed piece count handles any number of categories per
# person and avoids relying on data.frame recycling to build the long vector.
status_counts <- data.frame(
  combined = unlist(str_split(EDH_unnested$status, ";")),
  stringsAsFactors = FALSE
) %>%
  filter(!is.na(combined), combined != "") %>%
  # Drop the uncertainty marker, then trim the whitespace left after splitting.
  mutate(combined_clean = str_trim(str_remove_all(combined, "\\?"))) %>%
  count(combined_clean, sort = TRUE)
status_counts
# Horizontal bar chart of status categories, ordered by frequency.
status_counts %>%
  mutate(combined_clean = reorder(combined_clean, n)) %>%
  ggplot(aes(y = combined_clean, x = n, fill = combined_clean)) +
  # geom_col() always uses the identity stat; `stat` is not one of its parameters.
  geom_col(width = 0.8) +
  coord_cartesian(xlim = c(0, 10000)) +
  labs(
    x = "Number of instances",
    y = "Status category",
    title = "Overview of status references in the EDH dataset",
    # subtitle takes a plain string, not a ggtitle() object
    subtitle = paste("n =", nrow(EDH_tibble), "inscriptions")
  ) +
  geom_label(aes(label = n)) +
  # theme_minimal() is a complete theme, so the theme_linedraw() call that
  # used to precede it had no effect and has been dropped.
  theme_minimal()
ggsave("../EDH_people/figures/Status_overview.png", width = 12, height = 8)
# People with at least one age component recorded.
# The filter refers to the piped columns rather than to EDH_unnested$...,
# so it no longer depends on the outer table having the same row order.
EDH_unnested %>%
  select(`age: days`, `age: months`, `age: hours`, `age: years`) %>%
  filter(
    !is.na(`age: years`) | !is.na(`age: months`) |
      !is.na(`age: days`) | !is.na(`age: hours`)
  )
# Distinct raw values of the years attribute (numbers mixed with free text).
unique(EDH_unnested$`age: years`)
## [1] NA "70" "42"
## [4] "18" "8" "at least 20"
## [7] "35" "34" "5"
## [10] "10" "25" "55"
## [13] "at least 10" "2" "1"
## [16] "64" "53" "36"
## [19] "29" "15" "40"
## [22] "30" "7" "75"
## [25] "4" "100" "19"
## [28] "50" "at least 21" "23"
## [31] "26" "27" "60"
## [34] "14" "24" "90"
## [37] "20" "43" "data not available"
## [40] "16" "37" "22"
## [43] "3" "at least 40" "45"
## [46] "61" "at least 50" "65"
## [49] "21" "85" "6"
## [52] "48" "57" "67"
## [55] "at least 16" "73" "41"
## [58] "33" "82" "52"
## [61] "80" "12" "17"
## [64] "38" "at least 35" "44"
## [67] "62" "68" "at least 30"
## [70] "9" "28" "at least 2"
## [73] "39" "at least 23" "11"
## [76] "at least 17" "at least 70" "32"
## [79] "13" "89" "at least 1"
## [82] "47" "51" "93"
## [85] "at least 15" "86" "87"
## [88] "at least 37" "72" "at least 5"
## [91] "at least 12" "63" "at least 3"
## [94] "31" "at least 8" "54"
## [97] "at least 9" "at least 26" "at least 25"
## [100] "46" "81" "59"
## [103] "103" "at least 60" "at least 61"
## [106] "76" "84" "at least 75"
## [109] "at least 19" "at least 22" "92"
## [112] "69" "at least 110" "102"
## [115] "71" "56" "at least 41"
## [118] "105" "83" "95"
## [121] "at least 28" "at least 6" "at least 11"
## [124] "at least 76" "99" "58"
## [127] "66" "78" "74"
## [130] "49" "at least 45" "77"
## [133] "at least 32" "at least 24" "91"
## [136] "120" "at least 65" "at least 31"
## [139] "at least 7" "110" "at least 27"
## [142] "150" "at least 51" "at least 55"
## [145] "at least 4" "at least 13" "at least 14"
## [148] "104" "at least 18" "at least 44"
## [151] "at least 101" "at least 74" "at least 42"
## [154] "108" "at least 90" "125"
## [157] "at least 77" "at least 43" "at least 46"
## [160] "at least 36" "at least 80" "200"
# For each age component: how many people have it, followed by those rows.
# Filters use the piped columns instead of indexing EDH_unnested externally.
sum(!is.na(EDH_unnested$`age: years`))
## [1] 7993
EDH_unnested %>%
  select(`age: days`, `age: months`, `age: hours`, `age: years`) %>%
  filter(!is.na(`age: years`))
sum(!is.na(EDH_unnested$`age: months`))
## [1] 928
EDH_unnested %>%
  select(`age: days`, `age: months`, `age: hours`, `age: years`) %>%
  filter(!is.na(`age: months`))
sum(!is.na(EDH_unnested$`age: days`))
## [1] 689
EDH_unnested %>%
  select(`age: days`, `age: months`, `age: hours`, `age: years`) %>%
  filter(!is.na(`age: days`))
sum(!is.na(EDH_unnested$`age: hours`))
## [1] 24
EDH_unnested %>%
  select(`age: days`, `age: months`, `age: hours`, `age: years`) %>%
  filter(!is.na(`age: hours`))
Not ideal method as it skips a lot of textual descriptions
# Direct numeric coercion: any value that is not a bare number becomes NA.
EDH_unnested$`age: years` %>% as.numeric() %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 18.00 30.00 33.66 50.00 200.00 85548
EDH_unnested$`age: months` %>% as.numeric() %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 3.00 6.00 5.91 8.00 30.00 91583
EDH_unnested$`age: days` %>% as.numeric() %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 7.0 13.0 14.5 20.0 100.0 91819
EDH_unnested$`age: hours` %>% as.numeric() %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3 4 6 6 8 11 92406
Better method using regular expressions to detect years and converting them as numeric
# Extract the first run of digits from each age attribute (so "at least 20"
# yields 20) and convert it to numeric. The mutate() calls refer to the piped
# columns instead of indexing EDH_unnested from outside the pipeline.
ages <- EDH_unnested %>%
  select(`age: days`, `age: months`, `age: hours`, `age: years`) %>%
  mutate(
    age_years = as.numeric(str_extract(`age: years`, pattern = "[:digit:]+")),
    age_months = as.numeric(str_extract(`age: months`, pattern = "[:digit:]+")),
    age_days = as.numeric(str_extract(`age: days`, pattern = "[:digit:]+")),
    age_hours = as.numeric(str_extract(`age: hours`, pattern = "[:digit:]+"))
  )
summary(ages$age_years)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 17.0 30.0 32.7 46.0 200.0 84958
summary(ages$age_months)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 3.00 5.00 5.83 8.00 30.00 91546
summary(ages$age_days)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 7.00 13.00 14.34 20.00 100.00 91780
summary(ages$age_hours)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.00 4.25 6.00 5.95 7.75 11.00 92405
# Express the sub-year components as fractions of a year.
ages <- ages %>%
  mutate(
    months_to_years = age_months / 12,
    days_to_years = age_days / 365,
    hours_to_years = age_hours / (24 * 365)
  )
# Missing sub-year components count as zero. Missing years are NOT replaced,
# so total_age is NA whenever no year value exists.
# NOTE(review): people whose age is given only in months/days/hours therefore
# get total_age = NA - confirm this is intended.
ages <- ages %>%
  replace_na(list(months_to_years = 0, days_to_years = 0, hours_to_years = 0)) %>%
  mutate(total_age = age_years + months_to_years + days_to_years + hours_to_years) %>%
  select(-ends_with("to_years"))
# Full people table with numeric age components and a combined total_age in
# years. Digits are pulled out of the raw text (so "at least 20" yields 20).
# Columns are referenced inside the pipeline, not via EDH_unnested$... .
EDH_age <- EDH_unnested %>%
  mutate(
    age_years = as.numeric(str_extract(`age: years`, pattern = "[:digit:]+")),
    age_months = as.numeric(str_extract(`age: months`, pattern = "[:digit:]+")),
    age_days = as.numeric(str_extract(`age: days`, pattern = "[:digit:]+")),
    age_hours = as.numeric(str_extract(`age: hours`, pattern = "[:digit:]+")),
    months_to_years = age_months / 12,
    days_to_years = age_days / 365,
    hours_to_years = age_hours / (24 * 365)
  ) %>%
  # Missing sub-year components count as zero; a missing year keeps total_age NA.
  # NOTE(review): ages given only in months/days/hours end up as NA - confirm.
  replace_na(list(months_to_years = 0, days_to_years = 0, hours_to_years = 0)) %>%
  mutate(total_age = age_years + months_to_years + days_to_years + hours_to_years) %>%
  select(-ends_with("to_years"))
# Distribution of the combined age in years.
EDH_age$total_age %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 17.00 30.00 32.75 46.00 200.00 84958
# Percentage of people with a usable total_age.
sum(!is.na(EDH_age$total_age)) / (nrow(EDH_age) / 100)
## [1] 8.080972
# People whose occupation attribute is flagged as "data available".
# NOTE(review): select() without arguments keeps no columns, so only the row
# count is visible in the printed result - confirm this is intended.
select(dplyr::filter(EDH_unnested, occupation == "data available"))
Unfortunately, any other details about the occupation are not provided.
# One-column data frame of the origo attribute; the column is named after the
# expression `EDH_unnested$origo`.
origo <- as.data.frame(EDH_unnested$origo)
origo
# Frequency of each non-empty origo value, most common first.
# (disabled) mutate(clean_origo = str_replace_all(string = combined, pattern = "\\?", replacement = ""))
non_empty_origo <- filter(origo, `EDH_unnested$origo` != "")
count(non_empty_origo, `EDH_unnested$origo`, sort = TRUE)
# Inscription types among people with an origo statement.
EDH_unnested %>%
  filter(!is.na(origo)) %>%
  count(type_of_inscription_clean, sort = TRUE)
# Findspot / origo pairs on epitaphs, most frequent first.
# count() already restricts the output to the counted columns, so no
# preceding select() is needed.
EDH_unnested %>%
  filter(!is.na(origo) & type_of_inscription_clean == "epitaph") %>%
  count(findspot_ancient_clean, origo, sort = TRUE)
# All people with an origo statement.
has_origo <- EDH_unnested %>%
  filter(!is.na(origo))
# Split `coordinates` at the comma into longitude and latitude, strip the
# "c(" / ")" wrappers and convert both parts to numeric.
# NOTE(review): presumably coordinates prints as "c(lon, lat)" - confirm.
EDH_unnested <- EDH_unnested %>%
  separate(col = coordinates, into = c("longitude", "latitude"), sep = ",") %>%
  mutate(
    latitude = as.numeric(str_replace(latitude, pattern = "\\)", replacement = "")),
    longitude = as.numeric(str_replace(longitude, pattern = "c\\(", replacement = ""))
  )
library(raster)
library(sf)
# Map of people with an origo statement.
# The legend describes "Inscriptions with origo statement", so only rows with
# a non-missing origo are plotted (previously every row was drawn).
origo_points <- dplyr::filter(EDH_unnested, !is.na(origo))
origo_mapped <- leaflet(width = "100%") %>%
  #addProviderTiles("Stamen.Watercolor")%>%
  addProviderTiles("Stamen.TerrainBackground") %>% # Stamen terrain base map
  #addProviderTiles("Esri.WorldTopoMap", group = "Topo") %>%
  #addProviderTiles("Esri.WorldImagery", group = "ESRI Aerial") %>%
  #setView( lng = 35.9239625, lat = 31.9515694, zoom = 5 ) %>%
  #setMaxBounds(lat1=43.633977, lng1 =-11.227926 , lat2=35.133882 , lng2=50.882336) %>%
  #addPolylines(data = roads, color = "purple", weight = 1, opacity = 0.7) %>%
  addCircles(
    lng = origo_points$longitude,
    lat = origo_points$latitude,
    opacity = 0.1, radius = 2, fill = TRUE, color = "red", fillColor = "red"
  ) %>%
  #addCircles(lng = xx$X,
  # lat = xx$Y,
  # opacity = 0.5, radius = 15, fill = TRUE, color = "red" , fillColor = "black", popup = paste0("<b> City: </b>", xx$Name)) %>%
  addLegend(
    position = "bottomright",
    colors = c("Red"),
    labels = c("Inscriptions"), opacity = 1,
    title = "Inscriptions with origo statement"
  ) %>%
  addScaleBar(position = "bottomleft")
origo_mapped
# https://www.supplychaindataanalytics.com/leaflet-heatmaps-in-r/
#install.packages("leaflet.extras")
library(leaflet.extras)
library(RColorBrewer)
# Heat map of people with an origo statement.
# Rows are filtered once so that longitude and latitude stay paired; applying
# na.omit() to each vector separately could shift one against the other.
# NOTE(review): restricted to rows with origo to match the object name and the
# point map above - drop that condition if all people should be shown.
heat_points <- dplyr::filter(
  EDH_unnested,
  !is.na(origo), !is.na(longitude), !is.na(latitude)
)
heat_origo <- heat_points %>%
  leaflet(width = "100%") %>%
  addTiles() %>%
  #addProviderTiles("Esri.WorldImagery", group = "ESRI Aerial") %>%
  #addProviderTiles("Esri.WorldShadedRelief", group = "ESRI Aerial") %>%
  #addProviderTiles("Stamen.TerrainBackground") %>%
  addProviderTiles("Stamen.TonerBackground") %>%
  #addProviderTiles("CartoDB.VoyagerNoLabels") %>%
  setView(lng = 12.9239625, lat = 41.9515694, zoom = 4) %>%
  #setMaxBounds(lat1=40.633977, lng1 =-4.227926 , lat2=35.133882 , lng2=40.882336) %>%
  addHeatmap(
    lng = ~longitude, lat = ~latitude,
    intensity = 0.1, layerId = NULL, group = NULL, minOpacity = 0.1, #max = 1,
    radius = 2, blur = 3, gradient = "YlOrRd", cellSize = 1
  )
heat_origo
# People recorded on funerary inscriptions (epitaphs) only.
epitaph <- dplyr::filter(EDH_age, type_of_inscription_clean == "epitaph")
What percentage of people on funerary inscriptions state their age?
# Share (in %) of people on epitaphs with a usable total_age.
sum(!is.na(epitaph$total_age)) / (nrow(epitaph) / 100)
## [1] 15.51995
# Dot plot of the distinct ages attested in each province (one point per
# age/province combination), saved to disk afterwards.
age_by_province <- epitaph %>%
  dplyr::select(total_age, province_label_clean) %>%
  count(total_age, province_label_clean, sort = TRUE)
ggplot(age_by_province, aes(x = total_age, y = fct_rev(province_label_clean))) +
  geom_point(alpha = 0.5, color = "darkblue") +
  theme_minimal()
ggsave("../EDH_people/figures/Age_years_provinces.png", width = 8, height = 8)
# Install psych only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library(psych)
# Descriptive statistics of total_age per province, returned as one data
# frame (mat = TRUE) with values rounded to two digits.
age_provinces <- describeBy(
  epitaph$total_age,
  group = epitaph$province_label_clean,
  mat = TRUE, digits = 2
)
head(age_provinces)
# Mean age per province, for provinces with at least one stated age.
provinces_with_age <- age_provinces %>%
  filter(n > 0) %>%
  mutate(group1 = reorder(group1, mean))
ggplot(provinces_with_age, aes(y = group1, x = mean)) +
  # geom_col() already uses the identity stat; `stat` is not a valid parameter.
  geom_col(color = "white", fill = "blue", width = 0.8) +
  #coord_cartesian(xlim=c(0,80)) +
  theme_minimal() +
  theme(text = element_text(size = 16)) +
  labs(
    y = "Roman province", x = "Years",
    title = "Average age of people on inscriptions in the EDH database by province",
    # n comes from the plotted data rather than from a hard-coded figure
    subtitle = paste("n=", sum(provinces_with_age$n), "people")
  ) +
  geom_label(aes(label = mean), colour = "black", fontface = "bold", hjust = -0.1)
ggsave("../figures/Age_average_years_provinces.png", width = 12, height = 12)
# Mean age per province, limited to provinces with more than 100 stated ages.
provinces_100plus <- age_provinces %>%
  filter(n > 100) %>%
  mutate(group1 = reorder(group1, mean))
ggplot(provinces_100plus, aes(y = group1, x = mean)) +
  # geom_col() already uses the identity stat; `stat` is not a valid parameter.
  geom_col(color = "white", fill = "blue", width = 0.8) +
  #coord_cartesian(xlim=c(0,80)) +
  theme_minimal() +
  theme(text = element_text(size = 16)) +
  labs(
    y = "Roman province ", x = "Years",
    title = "Average age of people on inscriptions in the EDH database by province with more than 100 inscriptions stating age",
    # n counts the people in the provinces actually shown
    subtitle = paste("n=", sum(provinces_100plus$n), "people")
  ) +
  geom_label(aes(label = mean), colour = "black", fontface = "bold", hjust = -0.1)
ggsave("../figures/Age_average_years_provinces_100plus_inscr.png", width = 12, height = 12)
# Number of people with a stated age per province.
provinces_age_counts <- age_provinces %>%
  filter(n > 0) %>%
  mutate(group1 = reorder(group1, n))
ggplot(provinces_age_counts, aes(y = group1, x = n)) +
  # geom_col() already uses the identity stat; `stat` is not a valid parameter.
  geom_col(color = "white", fill = "purple", width = 0.8) +
  #coord_cartesian(xlim=c(0,80)) +
  theme_minimal() +
  theme(text = element_text(size = 16)) +
  labs(
    y = "Roman province", x = "n",
    title = "Instances of age information in the EDH database by province",
    # n comes from the plotted data rather than from a hard-coded figure
    subtitle = paste("n=", sum(provinces_age_counts$n), "people")
  ) +
  #geom_text(aes(label= n),hjust = -0.4) +
  geom_label(aes(label = n), colour = "black", fontface = "bold", hjust = -0.4)
ggsave("../figures/Age_info_provinces.png", width = 12, height = 12)
# Bin total_age into ordered age groups. case_when() takes the first matching
# condition, so each upper bound only needs a "<" test.
age_levels <- c("0-2.99", "3-14.99", "15-29.99", "30-39.99", "40-59.99", "over 60")
EDH_age <- EDH_age %>%
  mutate(agegroup = case_when(
    total_age < 3 ~ "0-2.99",
    total_age < 15 ~ "3-14.99",
    total_age < 30 ~ "15-29.99",
    total_age < 40 ~ "30-39.99",
    total_age < 60 ~ "40-59.99",
    # ">=" so that people aged exactly 60 are not silently dropped to NA
    total_age >= 60 ~ "over 60"
  ))
# Ordered factor so that plots show the groups from youngest to oldest.
EDH_age$agegroup <- factor(EDH_age$agegroup, levels = age_levels)
# Bar chart of age groups on funerary inscriptions.
# The title and file name refer to epitaphs, so the data is restricted to
# them; missing groups are dropped with is.na() instead of comparing against
# the string "NA".
epitaph_agegroups <- EDH_age %>%
  dplyr::filter(type_of_inscription_clean == "epitaph", !is.na(agegroup))
ggplot(epitaph_agegroups) +
  geom_bar(mapping = aes(x = agegroup, fill = agegroup)) +
  labs(
    x = "Age group (years)",
    y = "Number of instances",
    title = "Representation of age groups on funerary inscriptions (EDH dataset)",
    # subtitle takes a plain string, not a ggtitle() object
    subtitle = paste("n =", nrow(epitaph_agegroups), "inscriptions")
  )
#+ geom_label(aes(label = agegroup), colour = "black", fontface = "bold", hjust = -0.4)
ggsave("../EDH_people/figures/Age_groups_epitaphs.png", width = 12, height = 8)
# Flag each person as below 10 years or not; a missing age stays NA.
EDH_age <- mutate(
  EDH_age,
  age10 = case_when(
    total_age < 10 ~ "under10",
    total_age >= 10 ~ "over10"
  )
)
# Stacked proportional bars: share of under-10 / over-10 / unknown per province.
EDH_age %>%
  count(age10, province_label_clean, sort = FALSE) %>%
  ggplot(aes(fill = age10, y = province_label_clean, x = n)) +
  # geom_col() is the identity-stat form of geom_bar(stat = "identity")
  geom_col(position = "fill") +
  theme_minimal() +
  #theme(text = element_text(size=16)) +
  labs(y = "Roman province", x = "n", title = "Ratio of age children under 10 years on inscriptions per province")
#geom_text(aes(label= n),hjust = -0.4)
#geom_label(aes(label = n), colour = "black", fontface = "bold", hjust = -0.4)
# One row per province with the number of people over 10, under 10, and
# without a stated age (the column named `NA`). Absent combinations become 0
# so the ratios below are not turned into NA by a single empty cell.
EDH_age10 <- EDH_age %>%
  count(age10, province_label_clean, sort = FALSE) %>%
  pivot_wider(names_from = age10, values_from = n, values_fill = list(n = 0))
# Percentages relative to all people in the province. The total sums every
# count column (all but the first, the province label), so it does not depend
# on the columns sitting at positions 2 to 4.
EDH_under10 <- EDH_age10 %>%
  mutate(total = rowSums(EDH_age10[-1], na.rm = TRUE)) %>%
  mutate(under10_ratio = under10 / (total / 100)) %>%
  mutate(over10_ratio = over10 / (total / 100)) %>%
  mutate(age_stated = (over10 + under10) / (total / 100))
EDH_under10
Using not before and not after date separately.
# Descriptive statistics of total_age on epitaphs, grouped first by the
# not_before date and then by the not_after date.
age_not_before <- describeBy(
  epitaph$total_age,
  group = epitaph$not_before, mat = TRUE, digits = 2
)
age_not_after <- describeBy(
  epitaph$total_age,
  group = epitaph$not_after, mat = TRUE, digits = 2
)
# Number of people with a stated age per dating year (not_before / not_after).
# group1 holds the year as text; it is converted to numeric so that the x axis
# is continuous and the dotted reference lines fall on years 1, 100 and 200
# rather than on the 1st, 100th and 200th category.
age_not_bf_plot <- age_not_before %>%
  ggplot(aes(x = as.numeric(as.character(group1)), y = n)) +
  geom_point() +
  geom_vline(xintercept = 1, linetype = "dotted", color = "green", size = 0.5) +
  geom_vline(xintercept = 100, linetype = "dotted", color = "blue", size = 0.5) +
  geom_vline(xintercept = 200, linetype = "dotted", color = "red", size = 0.5)
age_not_aft_plot <- age_not_after %>%
  ggplot(aes(x = as.numeric(as.character(group1)), y = n)) +
  geom_point() +
  geom_vline(xintercept = 1, linetype = "dotted", color = "green", size = 0.5) +
  geom_vline(xintercept = 100, linetype = "dotted", color = "blue", size = 0.5) +
  geom_vline(xintercept = 200, linetype = "dotted", color = "red", size = 0.5)
Commentary: People state their age in the second century the most!
# Mean stated age per dating year with a smoothed trend. Dotted lines mark
# the years 0, 100, 200 and 300.
# as.character() before as.numeric() guards against group1 being a factor, in
# which case as.numeric() alone would return level codes instead of years.
age_not_before %>%
  filter(!is.na(mean)) %>%
  ggplot(aes(x = as.numeric(as.character(group1)), y = as.numeric(mean))) +
  geom_point() +
  geom_smooth() +
  geom_vline(xintercept = 0, linetype = "dotted", color = "green", size = 0.5) +
  geom_vline(xintercept = 100, linetype = "dotted", color = "blue", size = 0.5) +
  geom_vline(xintercept = 200, linetype = "dotted", color = "red", size = 0.5) +
  geom_vline(xintercept = 300, linetype = "dotted", color = "brown", size = 0.5)
age_not_after %>%
  filter(!is.na(mean)) %>%
  ggplot(aes(x = as.numeric(as.character(group1)), y = as.numeric(mean))) +
  geom_point() +
  geom_smooth() +
  geom_vline(xintercept = 0, linetype = "dotted", color = "green", size = 0.5) +
  geom_vline(xintercept = 100, linetype = "dotted", color = "blue", size = 0.5) +
  geom_vline(xintercept = 200, linetype = "dotted", color = "red", size = 0.5) +
  geom_vline(xintercept = 300, linetype = "dotted", color = "brown", size = 0.5)
# Number of people per gender category on epitaphs.
epitaph %>%
  count(gender, sort = TRUE) %>%
  ggplot(aes(y = gender, x = n)) +
  # A single constant fill. The earlier call passed `fill` twice (once as
  # `fill=gender`, which is not evaluated as a column outside aes()) and also
  # passed `stat`, which geom_col() does not accept.
  geom_col(color = "white", fill = "orange", width = 0.8) +
  coord_cartesian(xlim = c(0, 30000)) +
  theme_minimal() +
  theme(text = element_text(size = 14)) +
  labs(
    y = "Gender category", x = "Number of instances",
    title = "Gender ratio on epitaphs in the EDH database",
    # computed from the data so it cannot drift from the plotted subset
    subtitle = paste("n =", format(nrow(epitaph), big.mark = ","), "inscriptions")
  ) +
  geom_label(aes(label = n))
# + geom_text(aes(label = n), colour = "red", fontface = "bold", hjust = -0.1)
ggsave("../EDH_people/figures/Gender_total_epitaphs.png", width = 12, height = 12)
# Percentage of each gender category among people on epitaphs; categories at
# or below 0.5 % are left out.
epitaph %>%
  count(gender, sort = TRUE) %>%
  mutate(ratio_total = round(n / (nrow(epitaph) / 100), digits = 2)) %>%
  filter(ratio_total > 0.5) %>%
  ggplot(aes(y = gender, x = ratio_total, fill = gender)) +
  # geom_col() is the identity-stat form of geom_bar(stat = "identity")
  geom_col(width = 0.8) +
  coord_cartesian(xlim = c(0, 60)) +
  theme_minimal() +
  theme(text = element_text(size = 16)) +
  labs(
    y = "Gender category", x = "%",
    title = "Gender ratio on epitaphs in the EDH database",
    # subtitle takes a plain string, not a ggtitle() object
    subtitle = paste("n =", nrow(epitaph), "inscriptions")
  ) +
  geom_label(aes(label = ratio_total), hjust = -0.1)
#+ geom_text(aes(label = ratio_total))
ggsave("../EDH_people/figures/Gender_epitaphs.png", width = 8, height = 8)
# Split epitaph people into male and female, counting the uncertain codes
# ("M?", "F?") with the respective gender.
epitaph_male <- epitaph %>%
  dplyr::filter(gender %in% c("male", "M?"))
epitaph_female <- epitaph %>%
  dplyr::filter(gender %in% c("female", "F?"))
# One-row table holding the two totals.
gender_true_ratio <- data.frame(
  male = nrow(epitaph_male),
  female = nrow(epitaph_female)
)
# Long form (columns `gender` and `n`) for plotting; pivot_longer() replaces
# the superseded gather() plus the two rename() calls.
gender_ratio <- gender_true_ratio %>%
  pivot_longer(everything(), names_to = "gender", values_to = "n") %>%
  as.data.frame()
# Absolute number of male and female people on epitaphs.
gender_ratio %>%
  ggplot(aes(x = gender, y = n)) +
  # geom_col() already uses the identity stat; `stat` is not a valid parameter.
  geom_col(color = "white", fill = "orange", width = 0.5) +
  theme_minimal() +
  theme(text = element_text(size = 14)) +
  labs(
    x = "Gender category", y = "Number of people",
    title = "Gender ratio on epitaphs in the EDH database",
    # computed from the data rather than hard-coded
    subtitle = paste("n =", format(nrow(epitaph), big.mark = ","), "inscriptions")
  ) +
  #geom_label(aes(label= n)) +
  geom_label(aes(label = n), colour = "black", fontface = "bold", vjust = 0.5)
ggsave("../EDH_people/figures/Gender_total_ratio_epitaphs.png", width = 12, height = 12)
# Male / female share (in %) among people on epitaphs with a known gender.
gender_ratio %>%
  mutate(ratio = round(n / (sum(n)) * 100, digits = 2)) %>%
  # `color = gender` used to be passed to ggplot() itself, where it has no
  # effect; it has been removed.
  ggplot(aes(x = gender, y = ratio)) +
  # geom_col() already uses the identity stat; `stat` is not a valid parameter.
  geom_col(width = 0.5, fill = "brown") +
  theme_minimal() +
  theme(text = element_text(size = 14)) +
  labs(
    x = "Gender category", y = "%",
    title = "Gender ratio on epitaphs in the EDH database",
    # computed from the data rather than hard-coded
    subtitle = paste("n =", format(nrow(epitaph), big.mark = ","), "inscriptions")
  ) +
  #geom_label(aes(label= ratio)) +
  geom_label(aes(label = ratio), colour = "black", fontface = "bold", vjust = 0.5)
ggsave("../EDH_people/figures/Gender_ratio_epitaphs.png", width = 12, height = 12)